Author : Sirawit N.

source : Quantium

Load required libraries

#### Load required libraries
library(data.table)
library(ggplot2)
library(ggmosaic)
library(readr)
library(glue)          # string interpolation (was loaded twice; duplicate removed)
library(tidyverse)
library(readxl)
library(stringr)
library(stringi)
library(lubridate)
library(patchwork)
library(hrbrthemes)
library(RColorBrewer)
library(viridis)

Load the data to R

# Load the data. The original used typographic curly quotes, which are a
# syntax error in R -- replaced with plain ASCII double quotes.
df_xlsx <- read_excel("filename.xlsx")

df_csv <- read_csv("filename.csv")

Exploratory data analysis

EDA is the first step in any analysis to first understand the data. Let’s take a look at each of the datasets provided.

Observe the transaction data

# First look at the transaction data: sample rows, column names, structure.
head(transactionData, 10)
names(transactionData)
## [1] "DATE"           "STORE_NBR"      "LYLTY_CARD_NBR" "TXN_ID"
## [5] "PROD_NBR"       "PROD_NAME"      "PROD_QTY"       "TOT_SALES"
str(transactionData)
## tibble [264,836 x 8] -- all columns numeric except PROD_NAME (character).
## Note: DATE is stored as a number (an Excel serial date, e.g. 43390),
## so it will need converting to a proper Date class.

Observe the purchase behaviour data

# First look at the purchase-behaviour data: sample rows, columns, structure.
head(df_purc, 10)
names(df_purc)
## [1] "LYLTY_CARD_NBR"   "LIFESTAGE"        "PREMIUM_CUSTOMER"
str(df_purc)
## spec_tbl_df [72,637 x 3]
##  $ LYLTY_CARD_NBR  : num  1000 1002 1003 1004 1005 ...
##  $ LIFESTAGE       : chr  "YOUNG SINGLES/COUPLES" "YOUNG SINGLES/COUPLES" ...
##  $ PREMIUM_CUSTOMER: chr  "Premium" "Mainstream" "Budget" ...

Check that the columns we expect to be numeric are in numeric form and that date columns are in a date format. #### Examine transaction data

#### Convert the DATE column from an Excel integer serial to a Date.
#### CSV/Excel (Windows) serial dates count from 30 Dec 1899.
transactionData <- transactionData %>%
  mutate(DATE = as.Date(DATE, origin = "1899-12-30"))
transactionData

Examine PROD_NAME

text analysis by summarising the product name

# Frequency of each distinct product name, most common first.
transactionData %>% group_by(PROD_NAME) %>% summarise(COUNT = n()) %>% arrange(desc(COUNT))
#### Examine the words in PROD_NAME to see if there are any incorrect entries
#### such as products that are not chips
summary(transactionData$PROD_NAME)
##    Length     Class      Mode 
##    264836 character character
# Distinct product names with their transaction counts.
tdf <- transactionData %>% group_by(PROD_NAME) %>% summarise(n =n()) %>% arrange(desc(n))

# Keep only the PROD_NAME column (a one-column tibble).
tdf <- tdf[1]
tdf
# Split each product name into words on spaces.
productWords <- strsplit(tdf$PROD_NAME, " ")
productWords_df<-data.table(productWords)
setnames(productWords_df, 'words')

productWords_df
# Remove punctuation -- replaced with spaces.
productWords_df$words <-  str_replace_all(productWords_df$words,"[[:punct:]]"," ")
# Remove digits (pack weights such as 175) -- replaced with spaces.
productWords_df$words <- str_replace_all(productWords_df$words,"[0-9]"," ")
#### Remove the letter g/G left over from weight suffixes such as "175g".
#### NOTE(review): this strips EVERY g in the name, so it also mangles
#### brand words (e.g. "Original" -> "Ori inal") -- confirm this is intended.
productWords_df$words <- str_replace_all(productWords_df$words,"[gG]"," ")
wordsplit <- strsplit(productWords_df$words," ")
# check data type
typeof(wordsplit)
## [1] "list"
### since the variable is a list, unlist it before tabulating word frequencies
word_n <- as.data.frame(table(unlist(wordsplit)))
#### sort the word counts from highest to lowest frequency
word_n <- word_n %>% rename(Word = Var1, n = Freq ) %>% arrange(desc(n))
word_n

use regular expression with grepl to filter the name

There are salsa products in the dataset but we are only interested in the chips category, so let’s remove these.

#### Remove salsa products -- we are only interested in the chips category.
#### Flag rows whose product name contains "salsa" (case-insensitive).
#### Use the bare column name inside mutate() rather than transactionData$.
t1 <- transactionData %>%
  mutate(salsa = grepl("salsa", PROD_NAME, ignore.case = TRUE))
colnames(t1)
## [1] "DATE"           "STORE_NBR"      "LYLTY_CARD_NBR" "TXN_ID"
## [5] "PROD_NBR"       "PROD_NAME"      "PROD_QTY"       "TOT_SALES"
## [9] "salsa"
# `salsa` is logical, so filter on it directly instead of comparing
# against the string 'FALSE'.
t1 <- t1 %>% filter(!salsa)
summary(t1)
## 246,742 rows remain; dates span 2018-07-01 .. 2019-06-30.
# Drop the helper column by name rather than relying on column positions.
t1 <- t1 %>% select(-salsa)
colnames(t1)
## [1] "DATE"           "STORE_NBR"      "LYLTY_CARD_NBR" "TXN_ID"
## [5] "PROD_NBR"       "PROD_NAME"      "PROD_QTY"       "TOT_SALES"

Summarise the data to check for nulls and possible outliers

# Count the number of missing (NA) values in a vector/column.
check_na <- function(col) {
  sum(is.na(col))
}
# Count NAs in every column. sapply() iterates over the columns directly,
# keeping each column's own type; apply(t1, 2, ...) would first coerce the
# whole mixed-type data frame to a character matrix.
count_na <- sapply(t1, check_na)
count_na
##           DATE      STORE_NBR LYLTY_CARD_NBR         TXN_ID       PROD_NBR 
##              0              0              0              0              0 
##      PROD_NAME       PROD_QTY      TOT_SALES 
##              0              0              0
# No missing values -- now scan the five-number summaries for outliers.
summary(t1)
##       DATE              STORE_NBR     LYLTY_CARD_NBR        TXN_ID       
##  Min.   :2018-07-01   Min.   :  1.0   Min.   :   1000   Min.   :      1  
##  Median :2018-12-30   Median :130.0   Median : 130367   Median : 135183  
##  Max.   :2019-06-30   Max.   :272.0   Max.   :2373711   Max.   :2415841  
##     PROD_NBR       PROD_NAME            PROD_QTY         TOT_SALES      
##  Min.   :  1.00   Length:246742      Min.   :  1.000   Min.   :  1.700  
##  Median : 53.00   Class :character   Median :  2.000   Median :  7.400  
##  Max.   :114.00   Mode  :character   Max.   :200.000   Max.   :650.000
# PROD_QTY max of 200 against a 3rd quartile of 2 flags likely outliers.

Notice from the summary that the maximum value in the PROD_QTY column is 200, whereas its mean and third quartile are 1.9 and 2 respectively, so these transactions are suspected outliers.

# Inspect suspiciously large quantities (>= 50 units in one transaction).
t1 %>%
  select(DATE, LYLTY_CARD_NBR, PROD_NAME, PROD_QTY, TOT_SALES) %>%
  filter(PROD_QTY >= 50)

Notice that the customer with loyalty card number 226000 purchased "Dorito Corn Chp Supreme" in a quantity of 200 in a single transaction, on both 2018-08-19 and 2019-05-20 — the same product and quantity each time.

#### Check whether this customer has any other transactions.
filter(t1, LYLTY_CARD_NBR == 226000)

It looks like this customer has only had the two transactions over the year and is not an ordinary retail customer. The customer might be buying chips for commercial purposes instead. We’ll remove this loyalty card number from further analysis.

## Remove loyalty card 226000 (a likely commercial bulk buyer) from the data
#### so it does not skew the retail-customer analysis.
t1 <- filter(t1, LYLTY_CARD_NBR != 226000)

Re-examine transaction data

Let’s look at the number of transaction lines over time to see if there are any obvious data issues such as missing data.

#### Count the number of transactions per day.
t1_cdate <- t1 %>% count(DATE)
t1_cdate
t1 %>% count(DATE)
n_distinct(t1$DATE)
## [1] 364

There are only 364 rows, meaning only 364 distinct dates, which indicates one missing date. Let's create a sequence of dates from 1 Jul 2018 to 30 Jun 2019 and use it to chart the number of transactions over time and find the missing date.

# Find the missing date: build the full daily sequence spanning the data and
# report any day with no transactions. (The original referenced an undefined
# object `t1d`; the data frame in scope is `t1`.)
date_range <- seq(min(t1$DATE), max(t1$DATE), by = 1)
date_range[!date_range %in% t1$DATE]
## [1] "2018-12-25"
t1 %>% filter(DATE == '2018-12-25')
  • create a column of dates that includes every day from 1 Jul 2018 to 30 Jun 2019, and join it on to the data to fill in the missing day.
#### Create a sequence of every date from 1 Jul 2018 to 30 Jun 2019 and
#### left-join the transactions onto it so the missing day appears.
date_df <- data.frame(DATE = seq(as.Date("2018-07-01"), as.Date("2019-06-30"), by = "day"))
## Join explicitly on DATE (silences the "Joining, by" message and guards
## against accidental joins on other shared columns).
t1_misdt <- date_df %>% left_join(t1, by = "DATE")
## The missing date now appears in the data frame, with NA in the
## transaction columns. NOTE(review): n() counts that all-NA row as 1
## transaction for 2018-12-25 rather than 0 -- confirm this is acceptable
## for the chart.
transactions_by_day <- t1_misdt %>% group_by(DATE) %>% summarise(n = n()) %>% arrange(DATE)
transactions_by_day

Setting plot themes to format graphs

# Two statements on one line is a syntax error in R -- split them.
theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))

## Plot including the missing date.
## Use theme_bw() inside the chain: theme_set() mutates the global theme as
## a side effect and returns the *previous* theme, which is not what we want
## to add to an individual plot.
p2 <- ggplot(transactions_by_day, aes(x = DATE, y = n)) +
  geom_line() +
  labs(x = "Day", y = "Number of transactions", title = "Transactions over time") +
  scale_x_date(breaks = "1 month") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
## Plot without the missing date.
p1 <- ggplot(t1_cdate, aes(x = DATE, y = n)) +
  geom_line() +
  labs(x = "Day", y = "Number of transactions", title = "Transactions over time") +
  scale_x_date(breaks = "1 month") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

We can see that there is an increase in purchases in December and a break in late December. Let’s zoom in on this.

Filter to December and look at individual days

# Zoom in on December 2018. Use the bare column name inside filter() --
# `t1_cdate$DATE` re-reads the column from the outer object, which is
# fragile and defeats dplyr's data masking.
dec_transac <- t1_cdate %>%
  filter(between(DATE, as.Date('2018-12-01'), as.Date('2018-12-31')))

dec_transac

recreate the chart above zoomed in to the relevant dates.

# theme_bw() is added to the plot itself; theme_set() would mutate global
# state and add the previous theme instead.
ggplot(dec_transac, aes(x = DATE, y = n)) +
  geom_line() +
  labs(x = "Day", y = "Number of transactions", title = "Transactions in December 2018") +
  scale_x_date(breaks = "1 week") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

We can see that the increase in sales occurs in the lead-up to Christmas and that there are zero sales on Christmas day itself. This is due to shops being closed on Christmas day.

Let's compare the transactions in July 2018 versus June 2019 (the first and last months of the data).

# t1_cdate has no `month` column -- derive the month from DATE with
# lubridate::month(). Note the data span Jul 2018 .. Jun 2019, so the June
# slice is 2019 (the original title said 2018) and the July slice is 2018.
jun_transac <- t1_cdate %>% filter(month(DATE) == 6)

jul_transac <- t1_cdate %>% filter(month(DATE) == 7)

ju1 <- ggplot(jun_transac, aes(x = DATE, y = n)) +
  geom_line() +
  labs(x = "Day", y = "Number of transactions", title = "Transactions in June 2019") +
  scale_x_date(breaks = "1 week") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, vjust = 0.5))

jul1 <- ggplot(jul_transac, aes(x = DATE, y = n)) +
  geom_line() +
  labs(x = "Day", y = "Number of transactions", title = "Transactions in July 2018") +
  scale_x_date(breaks = "1 week") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, vjust = 0.5))

Transaction July 2018 vs June 2018

Now that we are satisfied that the data no longer has outliers, we can move on to creating other features such as brand of chips or pack size from PROD_NAME. We will start with pack size.

Packaging size

We can work this out by taking the digits that are in PROD_NAME

# Extract the pack size: the number embedded in PROD_NAME (e.g. "175g").
# Use the bare column name inside mutate() rather than transactionData$.
# NOTE(review): this derives from raw transactionData, not the cleaned t1 --
# confirm whether pack size should be computed after the salsa/outlier
# filtering described above.
t_pk <- transactionData %>% mutate(PACK_SIZE = parse_number(PROD_NAME))
#### Check that the pack sizes look sensible
summary(t_pk$PACK_SIZE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    70.0   150.0   170.0   182.4   175.0   380.0

The largest size is 380g and the smallest size is 70g - seems sensible!

Let’s plot a histogram of PACK_SIZE since we know that it is a categorical variable and not a continuous variable even though it is numeric.

 #histogram showing the number of transactions by pack size
ggplot(t_pk,aes(PACK_SIZE))+geom_histogram(bins = 11,fill="#48a4b2",color = 'white')+
  labs(x="Packaging size", y="Number of transactions", title="Product Packaging Size")+
  theme_minimal()

From the histogram the packaging sizes created look reasonable.

Now to create brands, we can use the first word in PROD_NAME to work out the brand name. Create a column which contains the brand of the product, by extracting it from the product name.

# Brand = the leading run of letters in the product name (the first word).
transactionData$Brand <- gsub("([A-Z]+).*", "\\1",
                              transactionData$PROD_NAME, ignore.case = TRUE)
transactionData %>% count(Brand)

Some of the brand names look like they are of the same brands - such as RED and RRD, which are both Red Rock Deli chips. Let’s combine these together.

#### Clean brand names
# Work on a copy so the raw transaction data stay untouched.
trD <- transactionData
# RRD and RED are both Red Rock Deli -- merge them under one label.
trD$Brand[trD$Brand == "RRD"] <-'RED'

other additional brand adjustments that required to change brand’s name.

# Map the remaining abbreviated/misspelt brand labels onto canonical names.
brand_fixes <- c(
  WW      = 'Woolworths',
  Dorito  = 'Doritos',
  Infzns  = 'Infuzions',
  Smith   = 'Smiths',
  Snbts   = 'Sunbites',
  GrnWves = 'Grain',
  Red     = 'RED'
)
needs_fix <- trD$Brand %in% names(brand_fixes)
trD$Brand[needs_fix] <- brand_fixes[trD$Brand[needs_fix]]

trD %>% count(Brand)

Examining customer data

let’s have a look at the customer dataset.

#### Examine the customer data: segment counts and distribution plots.

colnames(df_purc)
## [1] "LYLTY_CARD_NBR"   "LIFESTAGE"        "PREMIUM_CUSTOMER"
df_purc %>% group_by(PREMIUM_CUSTOMER) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(df_purc, aes(PREMIUM_CUSTOMER, fill = PREMIUM_CUSTOMER)) + geom_bar()

# guide = "none" replaces the deprecated guide = FALSE in scale_* functions.
df_purc %>% group_by(LIFESTAGE) %>% ggplot(aes(LIFESTAGE, fill = LIFESTAGE)) +
  geom_bar() + scale_fill_viridis(discrete = TRUE, guide = "none", option = "D")

ggplot(df_purc, aes(PREMIUM_CUSTOMER, fill = LIFESTAGE)) +
  geom_bar(position = 'fill') + scale_fill_viridis(discrete = TRUE, option = "D")

ggplot(df_purc, aes(PREMIUM_CUSTOMER, fill = PREMIUM_CUSTOMER)) +
  geom_bar() + facet_wrap(~LIFESTAGE, ncol = 4) + theme_minimal() +
  scale_fill_viridis(discrete = TRUE, guide = "none", option = "D")

#### Merge transaction data to customer data (left join: keep every
#### transaction row). Name the join key explicitly so merge() cannot
#### silently pick up other shared columns.
data <- merge(transactionData, df_purc, by = "LYLTY_CARD_NBR", all.x = TRUE)
data

As the number of rows in data is the same as that of transactionData, we can be sure that no duplicates were created. This is because we created data by setting all.x = TRUE (in other words, a left join) which means take all the rows in transactionData and find rows with matching values in shared columns and then joining the details in these rows to the x or the first mentioned table.

Let’s also check if some customers were not matched on by checking for nulls.
# See if any transactions did not have a matched customer (NAs in the
# joined columns). sapply() iterates over columns directly; apply() would
# coerce the mixed-type data frame to a character matrix first.
sapply(data, check_na)
##   LYLTY_CARD_NBR             DATE        STORE_NBR           TXN_ID 
##                0                0                0                0 
##         PROD_NBR        PROD_NAME         PROD_QTY        TOT_SALES 
##                0                0                0                0 
##            Brand        LIFESTAGE PREMIUM_CUSTOMER 
##                0                0                0

Great, there are no nulls! So all the customers in the transaction data have been accounted for in the customer dataset.

write this dataset into a csv file

# row.names = FALSE prevents writing a spurious unnamed index column.
write.csv(data, "QVI_data_clean.csv", row.names = FALSE)

Data analysis on customer segments

Now that the data is ready for analysis, we can define some metrics of interest to the client:

# Preview the columns of interest.
data %>% select(LYLTY_CARD_NBR, PROD_QTY, PROD_NAME, TOT_SALES, Brand,
                LIFESTAGE, PREMIUM_CUSTOMER)
# Keep only chip products (name contains "chip", case-insensitive).
chip <- data %>%
  select(LYLTY_CARD_NBR, PROD_QTY, PROD_NAME, TOT_SALES, Brand,
         LIFESTAGE, PREMIUM_CUSTOMER) %>%
  mutate(chips = grepl("chips*", PROD_NAME, ignore.case = TRUE)) %>%
  filter(chips)
# Top 5 chip spenders: one row per loyalty card. summarise() returning
# extra ungrouped columns (the old multi-row behaviour that required a
# later distinct()) is deprecated -- take the segment labels with first().
chip %>%
  select(card_no = LYLTY_CARD_NBR, TOT_SALES, l_sta = LIFESTAGE,
         cus_sta = PREMIUM_CUSTOMER) %>%
  group_by(card_no) %>%
  summarise(l_sta = first(l_sta), cus_sta = first(cus_sta),
            n_transc = n(), chip_sales = sum(TOT_SALES),
            avg_bill = round(chip_sales / n_transc, digits = 1)) %>%
  arrange(desc(chip_sales)) %>%
  head(5)

The top 5 spenders on chip products are shown in the table above. The biggest spender is the customer with card no. 69154 from the older families segment, with total spending of 79.6 USD over 9 transactions and an average chip bill of 8.8 USD for the period. Overall, budget customers favour purchasing chip products: only 1 of the 5 highest spenders is a premium customer, and all 5 are in the families lifestage (3 older families and 2 young families).

# Top 5 spenders across all product categories, one row per loyalty card.
# Same first()-based summarise as above to avoid the deprecated multi-row
# summarise() + distinct() pattern.
data %>%
  select(card_no = LYLTY_CARD_NBR, TOT_SALES, l_sta = LIFESTAGE,
         cus_sta = PREMIUM_CUSTOMER) %>%
  group_by(card_no) %>%
  summarise(l_sta = first(l_sta), cus_sta = first(cus_sta),
            n_transc = n(), Total_sales = sum(TOT_SALES),
            avg_bill = round(Total_sales / n_transc, digits = 1)) %>%
  arrange(desc(Total_sales)) %>%
  head(5)

The highest spender across all product categories is in the budget, older families segment, with a total bill of 138.6 USD across 17 transactions over the year and an average of 8.2 USD per bill.

# Transaction counts per LIFESTAGE x PREMIUM_CUSTOMER segment, plus a
# dodged bar chart of segment composition.
data %>%
  group_by(LIFESTAGE, segment = PREMIUM_CUSTOMER) %>%
  summarise(n = n()) %>%
  arrange(segment, LIFESTAGE, desc(n))
ggplot(data, aes(PREMIUM_CUSTOMER, fill = LIFESTAGE)) +
  geom_bar(position = 'dodge', color = 'white') +
  theme_minimal() +
  scale_fill_viridis(discrete = TRUE, option = "D") +
  xlab('Customer Segment')

# NOTE(review): `pctt` is not defined anywhere in this file -- presumably a
# percentage summary by LIFESTAGE and segment built elsewhere; confirm.
# geom_col() already uses the identity stat, so the original
# stat = "identity" argument was invalid and has been removed.
pctt %>% ggplot(aes(y = LIFESTAGE, x = pctt[[5]], fill = segment)) +
  geom_col(position = "fill", color = 'white') +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.7)) +
  labs(y = '', x = 'Life Stage %', title = "Customer segmentation") +
  geom_text(aes(label = paste(pctt[[5]], '%')),
            position = position_fill(vjust = 0.5)) +
  scale_fill_brewer(palette = "YlOrBr")

# Chip transaction counts per segment, then average bill and total spend
# by customer segment.
chip %>% group_by(LIFESTAGE, segment = PREMIUM_CUSTOMER) %>%
  summarise(n = n()) %>% arrange(segment, LIFESTAGE, desc(n))
#- What's the average chip price by customer segment
# Spell out `digits` -- the original relied on partial matching of `digit`,
# which is fragile and flagged by lintr.
chip %>% group_by(Customer_segment = PREMIUM_CUSTOMER) %>%
  summarise(avg_bill = round(mean(TOT_SALES), digits = 1),
            spends = round(sum(TOT_SALES), digits = 1)) %>%
  arrange(desc(spends))

The largest sales contribution is from the mainstream segment, with an average bill of 6.9 USD and a total of 197980.7 USD.

We could also ask our data team for more information. Examples are: - The customer’s total spend over the period and total spend for each transaction to understand what proportion of their grocery spend is on chips

# Total and average spend overall vs chips only, and the chip share of
# grocery spend. Two defects fixed here: the original summarise() calls
# each had a stray `, 1` argument that created unnamed columns, and the
# downstream code indexed a `sale_pp` object that was never defined.
# Reference the two summaries by column name instead.
sale_g <- data %>%
  summarise(avg_grocery_spend = round(mean(TOT_SALES), digits = 1),
            sum_grocery = sum(TOT_SALES))

sale_c <- chip %>%
  summarise(avg_chip_spending = round(mean(TOT_SALES), digits = 1),
            sum_chip = round(sum(TOT_SALES), digits = 1))

gro_sp_avg <- sale_g$avg_grocery_spend
gro_sp <- sale_g$sum_grocery
cp_sal <- sale_c$sum_chip
cp_avg <- sale_c$avg_chip_spending
pp_per <- round(cp_sal / gro_sp * 100, digits = 1)


glue('The total spend on the grocery :{gro_sp} USD, average : {gro_sp_avg} USD
     The total chip sale : {cp_sal} USD, average : {cp_avg} USD 
     with {pp_per}% proportion of their grocery spend is on chips')
## The total spend on the grocery :1933115 USD, average : 7.3 USD
## The total chip sale : 511588.3 USD, average : 6.9 USD 
## with 26.5% proportion of their grocery spend is on chips
# Count distinct chip-buying customers vs all customers, and their ratio.
c_cust <- chip %>% distinct(LYLTY_CARD_NBR) %>% summarise(purchase_chips = n())
groc_cust <- data %>% distinct(LYLTY_CARD_NBR) %>% summarise(purchase_grocery = n())

# Spell out `digits` (the original used partial matching via `digit`).
prop_c_g <- round((c_cust[[1]] / groc_cust[[1]]) * 100, digits = 1)
glue('The number of customer that purchase chip is {c_cust} and over all number of customer is {groc_cust} and the proportion of the grocery to chip customer is {prop_c_g}%')  
## The number of customer that purchase chip is 43625 and over all number of customer is 72636 and the proportion of the grocery to chip customer is 60.1%

Average number of units per customer by LIFESTAGE and PREMIUM_CUSTOMER

Calculate and plot the average number of units per customer by those two dimensions.

# Average number of units bought per customer within each
# LIFESTAGE x PREMIUM_CUSTOMER segment (data.table::uniqueN counts
# distinct loyalty cards), then a dodged bar chart.
unt_p_cust <- data %>%
  group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
  summarise(unit_per_cust = round(sum(PROD_QTY) / uniqueN(LYLTY_CARD_NBR),
                                  digits = 1))
ggplot(unt_p_cust,
       aes(weight = unit_per_cust, x = LIFESTAGE, fill = PREMIUM_CUSTOMER)) +
  geom_bar(position = 'dodge', color = 'black') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  scale_fill_viridis(discrete = TRUE, option = "G", guide = "none") +
  labs(y = "AVG Unit Per Transaction", x = "",
       title = "Average Units Per Customer")

Average price per unit by LIFESTAGE and PREMIUM_CUSTOMER

# Average sale price per unit within each LIFESTAGE x PREMIUM_CUSTOMER
# segment, then a dodged bar chart.
sle_p_unt <- data %>%
  group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
  summarise(sl_per_unt = round(sum(TOT_SALES) / sum(PROD_QTY), digits = 1))
sle_p_unt
ggplot(sle_p_unt,
       aes(weight = sl_per_unt, x = LIFESTAGE, fill = PREMIUM_CUSTOMER)) +
  geom_bar(position = 'dodge', color = 'black') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  scale_fill_brewer(palette = "YlGnBu") +
  labs(y = "AVG Price Per Unit", x = "",
       title = "Average Sale Price Per Unit")

Sirawit N. credit : Quantium